/*******************************************************************************
* Copyright 2019 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "common_f32.hpp"
#include "jit_generator.hpp"

namespace mkldnn {
namespace impl {
namespace cpu {

jit_sse41_kernel_b0_sgemm_kern::jit_sse41_kernel_b0_sgemm_kern() :
    jit_generator(nullptr, F32_COMPUTE_KERNEL_CODE_SIZE) {

// Register and stack-slot aliases used by the emitter calls below.
// Two tables are provided, selected on _WIN32; only the registers assigned to
// M, N, K, A, B, C and the OLD_* stack offsets differ between them
// (presumably to follow each platform's calling convention -- confirm).
//
// NOTE(review): these are short, very generic macro names (M, N, K, A, B, C,
// I, J, H, ...). No matching #undef is visible in this part of the file;
// confirm they are undefined again once the kernel has been emitted.
#ifndef _WIN32

// M, N, K: on entry each register holds an address; the generated code
//          replaces it with the 64-bit value loaded from that address.
// A, B:    input pointers; the generated code biases both by +128 bytes
//          before use, so operands are addressed with offsets from -0x80.
// C, LDC:  not taken from registers on entry -- both are loaded from the
//          stack through OLD_C / OLD_LDC. LDC is then shifted left by 2.
#define M   rdi
#define N   rsi
#define K   rdx
#define A   r8
#define B   r9
#define C   rcx
#define LDC r10

// Scratch registers:
// AA:       secondary pointer derived from A and K, used only as a prefetch
//           address in the unrolled loops.
// I, J:     loop counters initialised from N and M respectively.
// H:        inner-loop counter derived from K.
// AO, BO:   running pointers initialised from A and B.
// CO1, CO2: output pointers; CO2 is set to CO1 + 2 * LDC.
#define AA  r15
#define I   r11
#define J   r12
#define H   rax
#define AO  rbx
#define BO  rbp
#define CO1 r13
#define CO2 r14

// rsp-relative address expressions for the stack-passed arguments; they are
// meant to be wrapped in ptr[...]. They expand to an expression that names
// `stacksize`, so they can only be used where a variable of that name is in
// scope.
#define OLD_C       8+stacksize+rsp
#define OLD_LDC     16+stacksize+rsp

#else

// _WIN32 table: same roles as above, different registers for M, N, K, A, B, C.
// Here A and B are also loaded from the stack (OLD_A / OLD_B) rather than
// arriving in registers.
#define M   rcx
#define N   rdx
#define K   r8
#define A   rdi
#define B   rsi
#define C   r9
#define LDC r10
// Scratch registers: identical assignment to the non-Windows table.
#define AA  r15
#define I   r11
#define J   r12
#define H   rax
#define AO  rbx
#define BO  rbp
#define CO1 r13
#define CO2 r14

// rsp-relative address expressions for the stack-passed arguments (see the
// note on `stacksize` above; the same dependency applies here).
#define OLD_A       40+stacksize+rsp
#define OLD_B       48+stacksize+rsp
#define OLD_C       56+stacksize+rsp
#define OLD_LDC     64+stacksize+rsp

#endif

inLocalLabel();
{

Xbyak::Label l1090;
Xbyak::Label l10ac;
Xbyak::Label l11a4;
Xbyak::Label l11b0;
Xbyak::Label l11fc;
Xbyak::Label l1264;
Xbyak::Label l12c4;
Xbyak::Label l13c0;
Xbyak::Label l13d0;
Xbyak::Label l14cc;
Xbyak::Label l14d8;
Xbyak::Label l1528;
Xbyak::Label l1568;
Xbyak::Label l15c8;
Xbyak::Label l16c4;
Xbyak::Label l16d0;
Xbyak::Label l17cc;
Xbyak::Label l17d8;
Xbyak::Label l1828;
Xbyak::Label l1860;
Xbyak::Label l1864;
Xbyak::Label l1894;
Xbyak::Label l18e8;
Xbyak::Label l19e4;
Xbyak::Label l1a00;
Xbyak::Label l1afc;
Xbyak::Label l1b08;
Xbyak::Label l1b58;
Xbyak::Label l1bc0;
Xbyak::Label l1c24;
Xbyak::Label l1d24;
Xbyak::Label l1d34;
Xbyak::Label l1e34;
Xbyak::Label l1e40;
Xbyak::Label l1e90;
Xbyak::Label l1ed0;
Xbyak::Label l1f34;
Xbyak::Label l2034;
Xbyak::Label l2040;
Xbyak::Label l2140;
Xbyak::Label l214c;
Xbyak::Label l219c;
Xbyak::Label l21d4;
Xbyak::Label l21d8;
Xbyak::Label l2208;
Xbyak::Label l225c;
Xbyak::Label l2358;
Xbyak::Label l2370;
Xbyak::Label l246c;
Xbyak::Label l2478;
Xbyak::Label l24c8;
Xbyak::Label l2534;
Xbyak::Label l2598;
Xbyak::Label l2698;
Xbyak::Label l26a8;
Xbyak::Label l27a8;
Xbyak::Label l27b4;
Xbyak::Label l27c;
Xbyak::Label l2804;
Xbyak::Label l2844;
Xbyak::Label l28a8;
Xbyak::Label l298;
Xbyak::Label l29a8;
Xbyak::Label l29b4;
Xbyak::Label l2ab4;
Xbyak::Label l2ac0;
Xbyak::Label l2b10;
Xbyak::Label l2b4c;
Xbyak::Label l2b50;
Xbyak::Label l444;
Xbyak::Label l450;
Xbyak::Label l4bc;
Xbyak::Label l50;
Xbyak::Label l578;
Xbyak::Label l5e4;
Xbyak::Label l74;
Xbyak::Label l794;
Xbyak::Label l7a4;
Xbyak::Label l954;
Xbyak::Label l960;
Xbyak::Label l9d0;
Xbyak::Label la44;
Xbyak::Label lab0;
Xbyak::Label lc60;
Xbyak::Label lc6c;
Xbyak::Label ld0;
Xbyak::Label le1c;
Xbyak::Label le28;
Xbyak::Label le98;
Xbyak::Label lf00;
Xbyak::Label lf14;
Xbyak::Label lf44;
Xbyak::Label lf98;

    preamble();
    auto stacksize = get_size_of_abi_save_regs();
#ifdef _WIN32
    mov(A, ptr[OLD_A]);
    mov(B, ptr[OLD_B]);
#endif
    mov(C, ptr[OLD_C]);
    mov(LDC, ptr[OLD_LDC]);

    mov(M, qword[M]);
    mov(N, qword[N]);
    mov(K, qword[K]);
    shl(LDC, 0x2);
    sub(A, -128);
    sub(B, -128);
    mov(J, M);
    cmp(J, 0x8);
    jl(lf14, T_NEAR);
    align(4);

L(l50);
    mov(AA, K);
    imul(AA, AA, 0x20);
    add(AA, A);
    mov(CO1, C);
    add(C, 0x20);
    mov(BO, B);
    mov(I, N);
    cmp(I, 0x4);
    jl(l578, T_NEAR);
    align(4);

L(l74);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    movups(xmm1, xword[A-0x70]);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x60]);
    xorps(xmm10, xmm10);
    movups(xmm3, xword[A-0x50]);
    xorps(xmm11, xmm11);
    movaps(xmm4, xword[BO-0x80]);
    xorps(xmm12, xmm12);
    movaps(xmm5, xword[BO-0x70]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l444, T_NEAR);
    sub(H, 0x1e);
    jle(l27c, T_NEAR);
    align(4);

L(ld0);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -64);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(ld0, T_NEAR);
    align(4);

L(l27c);
    prefetcht0(byte[CO1+0x1c]);
    prefetcht0(byte[CO1+LDC*1+0x1c]);
    prefetcht0(byte[CO2+0x1c]);
    prefetcht0(byte[CO2+LDC*1+0x1c]);
    add(H, 0x1e);
    align(4);

L(l298);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -64);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l298, T_NEAR);
    align(4);

L(l444);
    mov(H, K);
    and_(H, 0x3);
    je(l4bc, T_NEAR);
    align(4);

L(l450);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x70]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x50]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    sub(AO, -32);
    sub(BO, -16);
    dec(H);
    jg(l450, T_NEAR);
    align(4);

L(l4bc);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm10, xmm1);
    movaps(xmm11, xmm1);
    shufps(xmm10, xmm0, 0xcc);
    shufps(xmm11, xmm0, 0x66);
    movaps(xmm0, xmm12);
    unpcklpd(xmm12, xmm13);
    unpckhpd(xmm0, xmm13);
    movaps(xmm1, xmm14);
    unpckhpd(xmm14, xmm15);
    unpcklpd(xmm1, xmm15);
    movaps(xmm13, xmm12);
    shufps(xmm12, xmm14, 0xcc);
    shufps(xmm13, xmm14, 0x66);
    movaps(xmm14, xmm1);
    movaps(xmm15, xmm1);
    shufps(xmm14, xmm0, 0xcc);
    shufps(xmm15, xmm0, 0x66);
    movups(xword[CO1+0x0], xmm8);
    movups(xword[CO1+0x10], xmm12);
    movups(xword[CO1+LDC*1+0x0], xmm9);
    movups(xword[CO1+LDC*1+0x10], xmm13);
    movups(xword[CO2], xmm10);
    movups(xword[CO2+0x10], xmm14);
    movups(xword[CO2+LDC*1], xmm11);
    movups(xword[CO2+LDC*1+0x10], xmm15);
    lea(CO1, ptr[CO1+LDC*4+0x0]);
    lea(CO2, ptr[CO2+LDC*4]);
    sub(I, 0x4);
    cmp(I, 0x4);
    jge(l74, T_NEAR);
    align(4);

L(l578);
    test(I, 0x2);
    jle(la44, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    movups(xmm1, xword[A-0x70]);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x60]);
    xorps(xmm10, xmm10);
    movups(xmm3, xword[A-0x50]);
    xorps(xmm11, xmm11);
    movddup(xmm4, qword[BO-0x80]);
    xorps(xmm12, xmm12);
    movddup(xmm5, qword[BO-0x78]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l954, T_NEAR);
    sub(H, 0x1e);
    jle(l794, T_NEAR);
    align(4);

L(l5e4);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -32);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l5e4, T_NEAR);
    align(4);

L(l794);
    prefetcht0(byte[CO1+0x1c]);
    prefetcht0(byte[CO1+LDC*1+0x1c]);
    add(H, 0x1e);
    align(4);

L(l7a4);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -32);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l7a4, T_NEAR);
    align(4);

L(l954);
    mov(H, K);
    and_(H, 0x3);
    je(l9d0, T_NEAR);
    align(4);

L(l960);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x78]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x50]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    sub(AO, -32);
    sub(BO, -8);
    dec(H);
    jg(l960, T_NEAR);
    align(4);

L(l9d0);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm0, xmm12);
    unpcklpd(xmm12, xmm13);
    unpckhpd(xmm0, xmm13);
    movaps(xmm1, xmm14);
    unpckhpd(xmm14, xmm15);
    unpcklpd(xmm1, xmm15);
    movaps(xmm13, xmm12);
    shufps(xmm12, xmm14, 0xcc);
    shufps(xmm13, xmm14, 0x66);
    movups(xword[CO1+0x0], xmm8);
    movups(xword[CO1+0x10], xmm12);
    movups(xword[CO1+LDC*1+0x0], xmm9);
    movups(xword[CO1+LDC*1+0x10], xmm13);
    lea(CO1, ptr[CO1+LDC*2+0x0]);
    lea(CO2, ptr[CO2+LDC*2]);
    align(4);

L(la44);
    test(I, 0x1);
    jle(lf00, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    movups(xmm1, xword[A-0x70]);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x60]);
    xorps(xmm10, xmm10);
    movups(xmm3, xword[A-0x50]);
    xorps(xmm11, xmm11);
    movss(xmm4, dword[BO-0x80]);
    xorps(xmm12, xmm12);
    movss(xmm5, dword[BO-0x7c]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(le1c, T_NEAR);
    sub(H, 0x1e);
    jle(lc60, T_NEAR);
    align(4);

L(lab0);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -16);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(lab0, T_NEAR);
    align(4);

L(lc60);
    prefetcht0(byte[CO1+0x1c]);
    add(H, 0x1e);
    align(4);

L(lc6c);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x30]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO-0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    prefetcht0(byte[AO+0x1c0]);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO+0x10]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm8, xmm5);
    addps(xmm12, xmm7);
    pshufd(xmm5, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm5, 0xb1);
    movaps(xmm7, xmm5);
    mulps(xmm5, xmm2);
    mulps(xmm7, xmm3);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    addps(xmm14, xmm7);
    add(AA, 0x8);
    sub(BO, -16);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO+0x20]);
    mulps(xmm7, xmm3);
    movups(xmm3, xword[AO+0x30]);
    sub(AO, -128);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(lc6c, T_NEAR);
    align(4);

L(le1c);
    mov(H, K);
    and_(H, 0x3);
    je(le98, T_NEAR);
    align(4);

L(le28);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm8, xmm4);
    addps(xmm12, xmm7);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm9, xmm6);
    addps(xmm13, xmm7);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    mulps(xmm7, xmm1);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x7c]);
    addps(xmm14, xmm7);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    mulps(xmm7, xmm1);
    movups(xmm1, xword[AO-0x50]);
    addps(xmm11, xmm6);
    addps(xmm15, xmm7);
    sub(AO, -32);
    sub(BO, -4);
    dec(H);
    jg(le28, T_NEAR);
    align(4);

L(le98);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm0, xmm12);
    unpcklpd(xmm12, xmm13);
    unpckhpd(xmm0, xmm13);
    movaps(xmm1, xmm14);
    unpckhpd(xmm14, xmm15);
    unpcklpd(xmm1, xmm15);
    movaps(xmm13, xmm12);
    shufps(xmm12, xmm14, 0xcc);
    shufps(xmm13, xmm14, 0x66);
    movups(xword[CO1+0x0], xmm8);
    movups(xword[CO1+0x10], xmm12);
    lea(CO1, ptr[CO1+LDC*1+0x0]);
    lea(CO2, ptr[CO2+LDC*1]);
    align(4);

L(lf00);
    mov(A, AO);
    sub(J, 0x8);
    cmp(J, 0x8);
    jge(l50, T_NEAR);
    align(4);

L(lf14);
    test(J, 0x4);
    jle(l1864, T_NEAR);
    mov(AA, K);
    imul(AA, AA, 0x10);
    add(AA, A);
    mov(CO1, C);
    add(C, 0x10);
    mov(BO, B);
    mov(I, N);
    cmp(I, 0x4);
    jl(l1264, T_NEAR);
    align(4);

L(lf44);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x70]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movaps(xmm4, xword[BO-0x80]);
    xorps(xmm12, xmm12);
    movaps(xmm5, xword[BO-0x70]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l11a4, T_NEAR);
    sub(H, 0x1e);
    jle(l1090, T_NEAR);
    align(4);

L(lf98);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(lf98, T_NEAR);
    align(4);

L(l1090);
    prefetcht0(byte[CO1+0xc]);
    prefetcht0(byte[CO1+LDC*1+0xc]);
    prefetcht0(byte[CO2+0xc]);
    prefetcht0(byte[CO2+LDC*1+0xc]);
    add(H, 0x1e);
    align(4);

// Unrolled accumulation loop; runs the passes left in H after the
// add(H, 0x1e) above. Each pass performs four multiply-accumulate steps that
// alternate between the register pairs (xmm4, xmm0) and (xmm5, xmm2). Within
// a step the B-side vector and three pshufd permutations of it are multiplied
// by the A-side vector and added into xmm8, xmm9, xmm10 and xmm11; the pair
// is then reloaded (16 bytes from BO with movaps, 16 bytes from AO with
// movups) for a later step.
// Per pass: AO += 64 and BO += 64 (sub with a negative immediate), AA += 8,
// H -= 1; the loop repeats while H stays positive.
L(l10ac);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l10ac, T_NEAR);
    align(4);

// Remainder setup: H = K & 3, the number of single steps not covered by the
// four-step unrolled passes. When it is zero, jump straight to the store
// sequence at l11fc.
L(l11a4);
    mov(H, K);
    and_(H, 0x3);
    je(l11fc, T_NEAR);
    align(4);

// Remainder loop: one multiply-accumulate step per iteration, H iterations.
// The vectors already held in xmm4 (B side) and xmm0 (A side) are consumed
// first, then the next 16 bytes are loaded from BO and AO, and both pointers
// advance by 16 bytes.
// NOTE(review): the movaps copies into xmm7 are not read by any instruction
// in this loop.
L(l11b0);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x70]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x70]);
    addps(xmm11, xmm6);
    sub(AO, -16);
    sub(BO, -16);
    dec(H);
    jg(l11b0, T_NEAR);
    align(4);

// Store sequence for a tile of four C locations with 16 bytes each.
// The unpcklpd/unpckhpd/shufps sequence recombines lanes of the accumulators
// xmm8..xmm11 into the four vectors that are written to CO1, CO1+LDC, CO2 and
// CO2+LDC. C is written without being read first.
// Afterwards CO1 and CO2 advance by 4*LDC, I drops by 4, and control returns
// to lf44 while at least 4 remain in I.
L(l11fc);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm10, xmm1);
    movaps(xmm11, xmm1);
    shufps(xmm10, xmm0, 0xcc);
    shufps(xmm11, xmm0, 0x66);
    movups(xword[CO1+0x0], xmm8);
    movups(xword[CO1+LDC*1+0x0], xmm9);
    movups(xword[CO2], xmm10);
    movups(xword[CO2+LDC*1], xmm11);
    lea(CO1, ptr[CO1+LDC*4+0x0]);
    lea(CO2, ptr[CO2+LDC*4]);
    sub(I, 0x4);
    cmp(I, 0x4);
    jge(lf44, T_NEAR);
    align(4);

// Remainder tile taken when bit 1 of I is set; otherwise skip to l1568.
// test clears OF and the masked result is never negative, so jle is taken
// exactly when the tested bit is clear.
// Setup: CO2 = CO1 + 2*LDC, preload two 16-byte A vectors (xmm0, xmm2) and
// two 8-byte B values duplicated into both register halves by movddup
// (xmm4, xmm5), clear xmm8..xmm15, and restart AO from A. BO is not reloaded
// here, so it continues from wherever the preceding code left it.
// NOTE(review): xmm12..xmm15 are cleared but not referenced by the loops of
// this tile.
// Pass count: H = K >> 2. H <= 0 goes directly to the remainder at l14cc;
// otherwise 0x1e is subtracted and, if nothing is left for the first loop,
// control goes to l13c0.
L(l1264);
    test(I, 0x2);
    jle(l1568, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x70]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movddup(xmm4, qword[BO-0x80]);
    xorps(xmm12, xmm12);
    movddup(xmm5, qword[BO-0x78]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l14cc, T_NEAR);
    sub(H, 0x1e);
    jle(l13c0, T_NEAR);
    align(4);

// First unrolled loop of this tile. It is entered only when (K >> 2) exceeds
// 0x1e; H holds (K >> 2) - 0x1e on entry, so the final 0x1e passes are left
// to the loop at l13d0.
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B values are reloaded with movddup (8 bytes, duplicated),
// A vectors with movups (16 bytes).
// Per pass: AO += 64, BO += 32, AA += 8, H -= 1.
L(l12c4);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l12c4, T_NEAR);
    align(4);

// Prefetch the two C locations this tile stores to (CO1 and CO1+LDC, byte
// offset 0xc), then add back the 0x1e that the tile setup subtracted from H
// so the loop below runs the remaining passes.
L(l13c0);
    prefetcht0(byte[CO1+0xc]);
    prefetcht0(byte[CO1+LDC*1+0xc]);
    add(H, 0x1e);
    align(4);

// Second unrolled loop of this tile; runs the passes left in H after the
// add(H, 0x1e) above (all K >> 2 passes when l12c4 was skipped, otherwise the
// final 0x1e).
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B values are reloaded with movddup (8 bytes, duplicated),
// A vectors with movups (16 bytes).
// Per pass: AO += 64, BO += 32, AA += 8, H -= 1.
L(l13d0);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l13d0, T_NEAR);
    align(4);

// Remainder setup: H = K & 3, the number of single steps not covered by the
// four-step unrolled passes. When it is zero, jump straight to the store
// sequence at l1528.
L(l14cc);
    mov(H, K);
    and_(H, 0x3);
    je(l1528, T_NEAR);
    align(4);

// Remainder loop: one multiply-accumulate step per iteration, H iterations.
// The values already held in xmm4 (B side) and xmm0 (A side) are consumed
// first; then 8 bytes are loaded from BO (movddup, duplicated into both
// halves) and 16 bytes from AO. AO advances by 16 bytes, BO by 8.
// NOTE(review): the movaps copies into xmm7 are not read by any instruction
// in this loop.
L(l14d8);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x78]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x70]);
    addps(xmm11, xmm6);
    sub(AO, -16);
    sub(BO, -8);
    dec(H);
    jg(l14d8, T_NEAR);
    align(4);

// Store sequence for the two-location tile: recombine lanes of xmm8..xmm11
// and write 16 bytes each to CO1 and CO1+LDC (C is written without being
// read first), then advance CO1 and CO2 by 2*LDC.
// NOTE(review): xmm0 and xmm1 are produced by the shuffle sequence but are
// not stored in this variant.
L(l1528);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movups(xword[CO1+0x0], xmm8);
    movups(xword[CO1+LDC*1+0x0], xmm9);
    lea(CO1, ptr[CO1+LDC*2+0x0]);
    lea(CO2, ptr[CO2+LDC*2]);
    align(4);

// Remainder tile taken when bit 0 of I is set; otherwise skip to l1860.
// test clears OF and the masked result is never negative, so jle is taken
// exactly when the tested bit is clear.
// Setup: CO2 = CO1 + 2*LDC, preload two 16-byte A vectors (xmm0, xmm2) and
// two 4-byte B scalars with movss (xmm4, xmm5; upper lanes zeroed by the
// load), clear xmm8..xmm15, and restart AO from A. BO is not reloaded here.
// NOTE(review): xmm12..xmm15 are cleared but not referenced by the loops of
// this tile.
// Pass count: H = K >> 2. H <= 0 goes directly to the remainder at l17cc;
// otherwise 0x1e is subtracted and, if nothing is left for the first loop,
// control goes to l16c4.
L(l1568);
    test(I, 0x1);
    jle(l1860, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movups(xmm0, xword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movups(xmm2, xword[A-0x70]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movss(xmm4, dword[BO-0x80]);
    xorps(xmm12, xmm12);
    movss(xmm5, dword[BO-0x7c]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l17cc, T_NEAR);
    sub(H, 0x1e);
    jle(l16c4, T_NEAR);
    align(4);

// First unrolled loop of this tile. It is entered only when (K >> 2) exceeds
// 0x1e; H holds (K >> 2) - 0x1e on entry, so the final 0x1e passes are left
// to the loop at l16d0.
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B scalars are reloaded with movss (4 bytes), A vectors with
// movups (16 bytes).
// Per pass: AO += 64, BO += 16, AA += 8, H -= 1.
L(l15c8);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l15c8, T_NEAR);
    align(4);

// Prefetch the single C location this tile stores to (CO1, byte offset 0xc),
// then add back the 0x1e that the tile setup subtracted from H so the loop
// below runs the remaining passes.
L(l16c4);
    prefetcht0(byte[CO1+0xc]);
    add(H, 0x1e);
    align(4);

// Second unrolled loop of this tile; runs the passes left in H after the
// add(H, 0x1e) above (all K >> 2 passes when l15c8 was skipped, otherwise the
// final 0x1e).
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B scalars are reloaded with movss (4 bytes), A vectors with
// movups (16 bytes).
// Per pass: AO += 64, BO += 16, AA += 8, H -= 1.
L(l16d0);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x50]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x40]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movups(xmm2, xword[AO-0x30]);
    sub(AO, -64);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l16d0, T_NEAR);
    align(4);

// Remainder setup: H = K & 3, the number of single steps not covered by the
// four-step unrolled passes. When it is zero, jump straight to the store
// sequence at l1828.
L(l17cc);
    mov(H, K);
    and_(H, 0x3);
    je(l1828, T_NEAR);
    align(4);

// Remainder loop: one multiply-accumulate step per iteration, H iterations.
// The values already held in xmm4 (B side) and xmm0 (A side) are consumed
// first; then 4 bytes are loaded from BO (movss) and 16 bytes from AO.
// AO advances by 16 bytes, BO by 4.
// NOTE(review): the movaps copies into xmm7 are not read by any instruction
// in this loop.
L(l17d8);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x7c]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movups(xmm0, xword[AO-0x70]);
    addps(xmm11, xmm6);
    sub(AO, -16);
    sub(BO, -4);
    dec(H);
    jg(l17d8, T_NEAR);
    align(4);

// Store sequence for the single-location tile: recombine lanes of
// xmm8..xmm11 and write 16 bytes to CO1 (C is written without being read
// first), then advance CO1 and CO2 by LDC.
// NOTE(review): only xmm8 is stored; xmm0, xmm1 and xmm9 are produced by the
// shuffle sequence but not written out in this variant.
L(l1828);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movups(xword[CO1+0x0], xmm8);
    lea(CO1, ptr[CO1+LDC*1+0x0]);
    lea(CO2, ptr[CO2+LDC*1]);
    align(4);

// Copy the advanced AO back into A; the section that follows derives its
// AA and AO pointers from A.
L(l1860);
    mov(A, AO);
    align(4);

// Section taken when bit 1 of J is set; otherwise skip to l21d8.
// test clears OF and the masked result is never negative, so jle is taken
// exactly when the tested bit is clear.
// Setup: AA = A + K*8 (used only as a prefetch address in the loops below),
// CO1 = C, C advances by 8 bytes for the next section, BO restarts from B,
// and I restarts from N. With fewer than 4 in I, go directly to the
// remainder handling at l1bc0.
L(l1864);
    test(J, 0x2);
    jle(l21d8, T_NEAR);
    mov(AA, K);
    imul(AA, AA, 0x8);
    add(AA, A);
    mov(CO1, C);
    add(C, 0x8);
    mov(BO, B);
    mov(I, N);
    cmp(I, 0x4);
    jl(l1bc0, T_NEAR);
    align(4);

// Head of the tile loop over I in steps of 4 (re-entered from l1b58).
// Setup: CO2 = CO1 + 2*LDC, preload two 8-byte A values with movsd
// (xmm0, xmm2) and two 16-byte B vectors with movaps (xmm4, xmm5), clear
// xmm8..xmm15, and restart AO from A.
// NOTE(review): xmm12..xmm15 are cleared but not referenced by the loops of
// this tile.
// Pass count: H = K >> 2. H <= 0 goes directly to the remainder at l1afc;
// otherwise 0x1e is subtracted and, if nothing is left for the first loop,
// control goes to l19e4.
L(l1894);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movsd(xmm0, qword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movsd(xmm2, qword[A-0x78]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movaps(xmm4, xword[BO-0x80]);
    xorps(xmm12, xmm12);
    movaps(xmm5, xword[BO-0x70]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l1afc, T_NEAR);
    sub(H, 0x1e);
    jle(l19e4, T_NEAR);
    align(4);

// First unrolled loop of this tile. It is entered only when (K >> 2) exceeds
// 0x1e; H holds (K >> 2) - 0x1e on entry, so the final 0x1e passes are left
// to the loop at l1a00.
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B vectors are reloaded with movaps (16 bytes), A values with
// movsd (8 bytes).
// Per pass: AO += 32, BO += 64, AA += 8, H -= 1.
L(l18e8);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l18e8, T_NEAR);
    align(4);

// Prefetch the four C locations this tile stores to (CO1, CO1+LDC, CO2,
// CO2+LDC; byte offset 0x4), then add back the 0x1e that the tile setup
// subtracted from H so the loop below runs the remaining passes.
L(l19e4);
    prefetcht0(byte[CO1+0x4]);
    prefetcht0(byte[CO1+LDC*1+0x4]);
    prefetcht0(byte[CO2+0x4]);
    prefetcht0(byte[CO2+LDC*1+0x4]);
    add(H, 0x1e);
    align(4);

// Second unrolled loop of this tile; runs the passes left in H after the
// add(H, 0x1e) above (all K >> 2 passes when l18e8 was skipped, otherwise the
// final 0x1e).
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B vectors are reloaded with movaps (16 bytes), A values with
// movsd (8 bytes).
// Per pass: AO += 32, BO += 64, AA += 8, H -= 1.
L(l1a00);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l1a00, T_NEAR);
    align(4);

// Remainder setup: H = K & 3, the number of single steps not covered by the
// four-step unrolled passes. When it is zero, jump straight to the store
// sequence at l1b58.
L(l1afc);
    mov(H, K);
    and_(H, 0x3);
    je(l1b58, T_NEAR);
    align(4);

// Remainder loop: one multiply-accumulate step per iteration, H iterations.
// The values already held in xmm4 (B side) and xmm0 (A side) are consumed
// first; then 16 bytes are loaded from BO (movaps) and 8 bytes from AO
// (movsd). AO advances by 8 bytes, BO by 16.
// NOTE(review): the movaps copies into xmm7 are not read by any instruction
// in this loop.
L(l1b08);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x70]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x78]);
    addps(xmm11, xmm6);
    sub(AO, -8);
    sub(BO, -16);
    dec(H);
    jg(l1b08, T_NEAR);
    align(4);

// Store sequence for a tile of four C locations with 8 bytes each.
// The unpcklpd/unpckhpd/shufps sequence recombines lanes of the accumulators
// xmm8..xmm11; movlps then writes the low 8 bytes of each result to CO1,
// CO1+LDC, CO2 and CO2+LDC. C is written without being read first.
// Afterwards CO1 and CO2 advance by 4*LDC, I drops by 4, and control returns
// to l1894 while at least 4 remain in I.
L(l1b58);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm10, xmm1);
    movaps(xmm11, xmm1);
    shufps(xmm10, xmm0, 0xcc);
    shufps(xmm11, xmm0, 0x66);
    movlps(qword[CO1+0x0], xmm8);
    movlps(qword[CO1+LDC*1+0x0], xmm9);
    movlps(qword[CO2], xmm10);
    movlps(qword[CO2+LDC*1], xmm11);
    lea(CO1, ptr[CO1+LDC*4+0x0]);
    lea(CO2, ptr[CO2+LDC*4]);
    sub(I, 0x4);
    cmp(I, 0x4);
    jge(l1894, T_NEAR);
    align(4);

// Remainder tile taken when bit 1 of I is set; otherwise skip to l1ed0.
// test clears OF and the masked result is never negative, so jle is taken
// exactly when the tested bit is clear.
// Setup: CO2 = CO1 + 2*LDC, preload two 8-byte A values with movsd
// (xmm0, xmm2) and two 8-byte B values duplicated into both register halves
// by movddup (xmm4, xmm5), clear xmm8..xmm15, and restart AO from A. BO is
// not reloaded here, so it continues from wherever the preceding code left
// it.
// NOTE(review): xmm12..xmm15 are cleared but not referenced by the loops of
// this tile.
// Pass count: H = K >> 2. H <= 0 goes directly to the remainder at l1e34;
// otherwise 0x1e is subtracted and, if nothing is left for the first loop,
// control goes to l1d24.
L(l1bc0);
    test(I, 0x2);
    jle(l1ed0, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movsd(xmm0, qword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movsd(xmm2, qword[A-0x78]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movddup(xmm4, qword[BO-0x80]);
    xorps(xmm12, xmm12);
    movddup(xmm5, qword[BO-0x78]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l1e34, T_NEAR);
    sub(H, 0x1e);
    jle(l1d24, T_NEAR);
    align(4);

// First unrolled loop of this tile. It is entered only when (K >> 2) exceeds
// 0x1e; H holds (K >> 2) - 0x1e on entry, so the final 0x1e passes are left
// to the loop at l1d34.
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B values are reloaded with movddup (8 bytes, duplicated),
// A values with movsd (8 bytes).
// Per pass: AO += 32, BO += 32, AA += 8, H -= 1.
L(l1c24);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l1c24, T_NEAR);
    align(4);

// Prefetch the two C locations this tile stores to (CO1 and CO1+LDC, byte
// offset 0x4), then add back the 0x1e that the tile setup subtracted from H
// so the loop below runs the remaining passes.
L(l1d24);
    prefetcht0(byte[CO1+0x4]);
    prefetcht0(byte[CO1+LDC*1+0x4]);
    add(H, 0x1e);
    align(4);

// Second unrolled loop of this tile; runs the passes left in H after the
// add(H, 0x1e) above (all K >> 2 passes when l1c24 was skipped, otherwise the
// final 0x1e).
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B values are reloaded with movddup (8 bytes, duplicated),
// A values with movsd (8 bytes).
// Per pass: AO += 32, BO += 32, AA += 8, H -= 1.
L(l1d34);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l1d34, T_NEAR);
    align(4);

// Remainder setup: H = K & 3, the number of single steps not covered by the
// four-step unrolled passes. When it is zero, jump straight to the store
// sequence at l1e90.
L(l1e34);
    mov(H, K);
    and_(H, 0x3);
    je(l1e90, T_NEAR);
    align(4);

// Remainder loop: one multiply-accumulate step per iteration, H iterations.
// The values already held in xmm4 (B side) and xmm0 (A side) are consumed
// first; then 8 bytes are loaded from BO (movddup, duplicated into both
// halves) and 8 bytes from AO (movsd). AO and BO each advance by 8 bytes.
// NOTE(review): the movaps copies into xmm7 are not read by any instruction
// in this loop.
L(l1e40);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x78]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x78]);
    addps(xmm11, xmm6);
    sub(AO, -8);
    sub(BO, -8);
    dec(H);
    jg(l1e40, T_NEAR);
    align(4);

// Store sequence for the two-location tile: recombine lanes of xmm8..xmm11
// and write the low 8 bytes of xmm8 and xmm9 to CO1 and CO1+LDC with movlps
// (C is written without being read first), then advance CO1 and CO2 by
// 2*LDC.
// NOTE(review): xmm0 and xmm1 are produced by the shuffle sequence but are
// not stored in this variant.
L(l1e90);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movlps(qword[CO1+0x0], xmm8);
    movlps(qword[CO1+LDC*1+0x0], xmm9);
    lea(CO1, ptr[CO1+LDC*2+0x0]);
    lea(CO2, ptr[CO2+LDC*2]);
    align(4);

// Remainder tile taken when bit 0 of I is set; otherwise skip to l21d4.
// test clears OF and the masked result is never negative, so jle is taken
// exactly when the tested bit is clear.
// Setup: CO2 = CO1 + 2*LDC, preload two 8-byte A values with movsd
// (xmm0, xmm2) and two 4-byte B scalars with movss (xmm4, xmm5), clear
// xmm8..xmm15, and restart AO from A. BO is not reloaded here.
// NOTE(review): xmm12..xmm15 are cleared but not referenced by the loops of
// this tile.
// Pass count: H = K >> 2. H <= 0 goes directly to the remainder at l2140;
// otherwise 0x1e is subtracted and, if nothing is left for the first loop,
// control goes to l2034.
L(l1ed0);
    test(I, 0x1);
    jle(l21d4, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movsd(xmm0, qword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movsd(xmm2, qword[A-0x78]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movss(xmm4, dword[BO-0x80]);
    xorps(xmm12, xmm12);
    movss(xmm5, dword[BO-0x7c]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l2140, T_NEAR);
    sub(H, 0x1e);
    jle(l2034, T_NEAR);
    align(4);

// First unrolled loop of this tile. It is entered only when (K >> 2) exceeds
// 0x1e; H holds (K >> 2) - 0x1e on entry, so the final 0x1e passes are left
// to the loop at l2040.
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B scalars are reloaded with movss (4 bytes), A values with
// movsd (8 bytes).
// Per pass: AO += 32, BO += 16, AA += 8, H -= 1.
L(l1f34);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l1f34, T_NEAR);
    align(4);

// Prefetch the single C location this tile stores to (CO1, byte offset 0x4),
// then add back the 0x1e that the tile setup subtracted from H so the loop
// below runs the remaining passes.
L(l2034);
    prefetcht0(byte[CO1+0x4]);
    add(H, 0x1e);
    align(4);

// Second unrolled loop of this tile; runs the passes left in H after the
// add(H, 0x1e) above (all K >> 2 passes when l1f34 was skipped, otherwise the
// final 0x1e).
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B scalars are reloaded with movss (4 bytes), A values with
// movsd (8 bytes).
// Per pass: AO += 32, BO += 16, AA += 8, H -= 1.
L(l2040);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x70]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x68]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x60]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movsd(xmm2, qword[AO-0x58]);
    sub(AO, -32);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l2040, T_NEAR);
    align(4);

// Remainder setup: H = K & 3, the number of single steps not covered by the
// four-step unrolled passes. When it is zero, jump straight to the store
// sequence at l219c.
L(l2140);
    mov(H, K);
    and_(H, 0x3);
    je(l219c, T_NEAR);
    align(4);

// Remainder loop: one multiply-accumulate step per iteration, H iterations.
// The values already held in xmm4 (B side) and xmm0 (A side) are consumed
// first; then 4 bytes are loaded from BO (movss) and 8 bytes from AO
// (movsd). AO advances by 8 bytes, BO by 4.
// NOTE(review): the movaps copies into xmm7 are not read by any instruction
// in this loop.
L(l214c);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x7c]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movsd(xmm0, qword[AO-0x78]);
    addps(xmm11, xmm6);
    sub(AO, -8);
    sub(BO, -4);
    dec(H);
    jg(l214c, T_NEAR);
    align(4);

// Store sequence for the single-location tile: recombine lanes of
// xmm8..xmm11 and write the low 8 bytes of xmm8 to CO1 with movlps (C is
// written without being read first), then advance CO1 and CO2 by LDC.
// NOTE(review): only xmm8 is stored; xmm0, xmm1 and xmm9 are produced by the
// shuffle sequence but not written out in this variant.
L(l219c);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movlps(qword[CO1+0x0], xmm8);
    lea(CO1, ptr[CO1+LDC*1+0x0]);
    lea(CO2, ptr[CO2+LDC*1]);
    align(4);

// Copy the advanced AO back into A; the section that follows derives its
// AA and AO pointers from A.
L(l21d4);
    mov(A, AO);
    align(4);

// Section taken when bit 0 of J is set; otherwise skip to l2b50.
// test clears OF and the masked result is never negative, so jle is taken
// exactly when the tested bit is clear.
// Setup: AA = A + K*4 (used only as a prefetch address in the loops below),
// CO1 = C, C advances by 4 bytes, BO restarts from B, and I restarts from N.
// With fewer than 4 in I, go directly to the remainder handling at l2534.
L(l21d8);
    test(J, 0x1);
    jle(l2b50, T_NEAR);
    mov(AA, K);
    imul(AA, AA, 0x4);
    add(AA, A);
    mov(CO1, C);
    add(C, 0x4);
    mov(BO, B);
    mov(I, N);
    cmp(I, 0x4);
    jl(l2534, T_NEAR);
    align(4);

// Head of the tile loop over I in steps of 4 (re-entered from l24c8).
// Setup: CO2 = CO1 + 2*LDC, preload two 4-byte A scalars with movss
// (xmm0, xmm2) and two 16-byte B vectors with movaps (xmm4, xmm5), clear
// xmm8..xmm15, and restart AO from A.
// NOTE(review): xmm12..xmm15 are cleared but not referenced by the loops of
// this tile.
// Pass count: H = K >> 2. H <= 0 goes directly to the remainder at l246c;
// otherwise 0x1e is subtracted and, if nothing is left for the first loop,
// control goes to l2358.
L(l2208);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movss(xmm0, dword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movss(xmm2, dword[A-0x7c]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movaps(xmm4, xword[BO-0x80]);
    xorps(xmm12, xmm12);
    movaps(xmm5, xword[BO-0x70]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l246c, T_NEAR);
    sub(H, 0x1e);
    jle(l2358, T_NEAR);
    align(4);

// First unrolled loop of this tile. It is entered only when (K >> 2) exceeds
// 0x1e; H holds (K >> 2) - 0x1e on entry, so the final 0x1e passes are left
// to the loop at l2370.
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B vectors are reloaded with movaps (16 bytes), A scalars with
// movss (4 bytes).
// Per pass: AO += 16, BO += 64, AA += 8, H -= 1.
L(l225c);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l225c, T_NEAR);
    align(4);

// Prefetch the four C locations this tile stores to (CO1, CO1+LDC, CO2,
// CO2+LDC; byte offset 0), then add back the 0x1e that the tile setup
// subtracted from H so the loop below runs the remaining passes.
L(l2358);
    prefetcht0(byte[CO1+0x0]);
    prefetcht0(byte[CO1+LDC*1+0x0]);
    prefetcht0(byte[CO2]);
    prefetcht0(byte[CO2+LDC*1]);
    add(H, 0x1e);
    align(4);

// Second unrolled loop of this tile; runs the passes left in H after the
// add(H, 0x1e) above (all K >> 2 passes when l225c was skipped, otherwise the
// final 0x1e).
// Each pass performs four multiply-accumulate steps alternating between the
// register pairs (xmm4, xmm0) and (xmm5, xmm2); results are added into
// xmm8..xmm11. B vectors are reloaded with movaps (16 bytes), A scalars with
// movss (4 bytes).
// Per pass: AO += 16, BO += 64, AA += 8, H -= 1.
L(l2370);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    // Step 1: xmm4 (B side) x xmm0 (A side).
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x60]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    // Step 2: xmm5 (B side) x xmm2 (A side).
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x50]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    // Step 3: xmm4 x xmm0 again, using the values reloaded in step 1.
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x40]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    // Step 4: xmm5 x xmm2 again; pointer updates are interleaved with it.
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movaps(xmm5, xword[BO-0x30]);
    add(AA, 0x8);
    sub(BO, -64);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l2370, T_NEAR);
    align(4);

// Remainder setup: H = K & 3, the number of single steps not covered by the
// four-step unrolled passes. When it is zero, jump straight to the store
// sequence at l24c8.
L(l246c);
    mov(H, K);
    and_(H, 0x3);
    je(l24c8, T_NEAR);
    align(4);

// Remainder loop: one multiply-accumulate step per iteration, H iterations.
// The values already held in xmm4 (B side) and xmm0 (A side) are consumed
// first; then 16 bytes are loaded from BO (movaps) and 4 bytes from AO
// (movss). AO advances by 4 bytes, BO by 16.
// NOTE(review): the movaps copies into xmm7 are not read by any instruction
// in this loop.
L(l2478);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movaps(xmm4, xword[BO-0x70]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x7c]);
    addps(xmm11, xmm6);
    sub(AO, -4);
    sub(BO, -16);
    dec(H);
    jg(l2478, T_NEAR);
    align(4);

// Store sequence for a tile of four C locations with 4 bytes each.
// The unpcklpd/unpckhpd/shufps sequence recombines lanes of the accumulators
// xmm8..xmm11; movss then writes the lowest lane of each result to CO1,
// CO1+LDC, CO2 and CO2+LDC. C is written without being read first.
// Afterwards CO1 and CO2 advance by 4*LDC, I drops by 4, and control returns
// to l2208 while at least 4 remain in I.
L(l24c8);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movaps(xmm10, xmm1);
    movaps(xmm11, xmm1);
    shufps(xmm10, xmm0, 0xcc);
    shufps(xmm11, xmm0, 0x66);
    movss(dword[CO1+0x0], xmm8);
    movss(dword[CO1+LDC*1+0x0], xmm9);
    movss(dword[CO2], xmm10);
    movss(dword[CO2+LDC*1], xmm11);
    lea(CO1, ptr[CO1+LDC*4+0x0]);
    lea(CO2, ptr[CO2+LDC*4]);
    sub(I, 0x4);
    cmp(I, 0x4);
    jge(l2208, T_NEAR);
    align(4);

L(l2534);
    test(I, 0x2);
    jle(l2844, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movss(xmm0, dword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movss(xmm2, dword[A-0x7c]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movddup(xmm4, qword[BO-0x80]);
    xorps(xmm12, xmm12);
    movddup(xmm5, qword[BO-0x78]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l27a8, T_NEAR);
    sub(H, 0x1e);
    jle(l2698, T_NEAR);
    align(4);

L(l2598);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l2598, T_NEAR);
    align(4);

L(l2698);
    prefetcht0(byte[CO1+0x0]);
    prefetcht0(byte[CO1+LDC*1+0x0]);
    add(H, 0x1e);
    align(4);

L(l26a8);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x70]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x68]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x60]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movddup(xmm5, qword[BO-0x58]);
    add(AA, 0x8);
    sub(BO, -32);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l26a8, T_NEAR);
    align(4);

L(l27a8);
    mov(H, K);
    and_(H, 0x3);
    je(l2804, T_NEAR);
    align(4);

L(l27b4);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movddup(xmm4, qword[BO-0x78]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x7c]);
    addps(xmm11, xmm6);
    sub(AO, -4);
    sub(BO, -8);
    dec(H);
    jg(l27b4, T_NEAR);
    align(4);

L(l2804);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movss(dword[CO1+0x0], xmm8);
    movss(dword[CO1+LDC*1+0x0], xmm9);
    lea(CO1, ptr[CO1+LDC*2+0x0]);
    lea(CO2, ptr[CO2+LDC*2]);
    align(4);

L(l2844);
    test(I, 0x1);
    jle(l2b4c, T_NEAR);
    lea(CO2, ptr[CO1+LDC*2+0x0]);
    movss(xmm0, dword[A-0x80]);
    xorps(xmm8, xmm8);
    xorps(xmm9, xmm9);
    movss(xmm2, dword[A-0x7c]);
    xorps(xmm10, xmm10);
    xorps(xmm11, xmm11);
    movss(xmm4, dword[BO-0x80]);
    xorps(xmm12, xmm12);
    movss(xmm5, dword[BO-0x7c]);
    xorps(xmm13, xmm13);
    xorps(xmm14, xmm14);
    xorps(xmm15, xmm15);
    mov(AO, A);
    mov(H, K);
    sar(H, 0x2);
    jle(l2ab4, T_NEAR);
    sub(H, 0x1e);
    jle(l29a8, T_NEAR);
    align(4);

L(l28a8);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l28a8, T_NEAR);
    align(4);

L(l29a8);
    prefetcht0(byte[CO1+0x0]);
    add(H, 0x1e);
    align(4);

L(l29b4);
    prefetcht0(byte[AO+0x180]);
    prefetcht0(byte[BO+0x100]);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x78]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x78]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x74]);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x74]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x70]);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x70]);
    addps(xmm11, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm8, xmm5);
    pshufd(xmm5, xmm6, 0x1b);
    mulps(xmm6, xmm2);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm5, 0xb1);
    mulps(xmm5, xmm2);
    addps(xmm10, xmm5);
    movss(xmm5, dword[BO-0x6c]);
    add(AA, 0x8);
    sub(BO, -16);
    mulps(xmm6, xmm2);
    movss(xmm2, dword[AO-0x6c]);
    sub(AO, -16);
    addps(xmm11, xmm6);
    prefetcht0(byte[AA-0x78]);
    sub(H, 0x1);
    jg(l29b4, T_NEAR);
    align(4);

L(l2ab4);
    mov(H, K);
    and_(H, 0x3);
    je(l2b10, T_NEAR);
    align(4);

L(l2ac0);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm8, xmm4);
    pshufd(xmm4, xmm6, 0x1b);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    addps(xmm9, xmm6);
    pshufd(xmm6, xmm4, 0xb1);
    movaps(xmm7, xmm4);
    mulps(xmm4, xmm0);
    addps(xmm10, xmm4);
    movss(xmm4, dword[BO-0x7c]);
    movaps(xmm7, xmm6);
    mulps(xmm6, xmm0);
    movss(xmm0, dword[AO-0x7c]);
    addps(xmm11, xmm6);
    sub(AO, -4);
    sub(BO, -4);
    dec(H);
    jg(l2ac0, T_NEAR);
    align(4);

L(l2b10);
    movaps(xmm0, xmm8);
    unpcklpd(xmm8, xmm9);
    unpckhpd(xmm0, xmm9);
    movaps(xmm1, xmm10);
    unpckhpd(xmm10, xmm11);
    unpcklpd(xmm1, xmm11);
    movaps(xmm9, xmm8);
    shufps(xmm8, xmm10, 0xcc);
    shufps(xmm9, xmm10, 0x66);
    movss(dword[CO1+0x0], xmm8);
    lea(CO1, ptr[CO1+LDC*1+0x0]);
    lea(CO2, ptr[CO2+LDC*1]);
    align(4);

L(l2b4c);
    mov(A, AO);
    align(4);

L(l2b50);

    postamble();
}
outLocalLabel();

// Remove the register-alias and stack-offset macros that were #defined at the
// top of this constructor, so the short names (M, N, K, A, B, C, ...) do not
// leak into any code that follows in this translation unit.
#undef M
#undef N
#undef K
#undef A
#undef B
#undef C
#undef LDC
#undef AA
#undef I
#undef J
#undef H
#undef AO
#undef BO
#undef CO1
#undef CO2
// OLD_A and OLD_B are only #defined in the _WIN32 branch, so they are only
// removed under the same guard; OLD_C and OLD_LDC are #defined in both
// branches of the #ifndef _WIN32 block and are removed unconditionally.
#ifdef _WIN32
#undef OLD_A
#undef OLD_B
#endif
#undef OLD_C
#undef OLD_LDC
}

}
}
}
